Las variables son las siguientes:
# Load the Pima.tr2 data set from the MASS package (without attaching MASS)
# and check what kind of object it is.
data(Pima.tr2, package="MASS")
class(Pima.tr2)
## [1] "data.frame"
str(Pima.tr2)
## 'data.frame': 300 obs. of 8 variables:
## $ npreg: int 5 7 5 0 0 5 3 1 3 2 ...
## $ glu : int 86 195 77 165 107 97 83 193 142 128 ...
## $ bp : int 68 70 82 76 60 76 58 50 80 78 ...
## $ skin : int 28 33 41 43 25 27 31 16 15 37 ...
## $ bmi : num 30.2 25.1 35.8 47.9 26.4 35.6 34.3 25.9 32.4 43.3 ...
## $ ped : num 0.364 0.163 0.156 0.259 0.133 ...
## $ age : int 24 55 35 26 23 52 25 24 63 31 ...
## $ type : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 1 1 2 ...
glimpse(Pima.tr2)
## Observations: 300
## Variables: 8
## $ npreg <int> 5, 7, 5, 0, 0, 5, 3, 1, 3, 2, 0, 9, 1, 12, 1, 4, 1, 11, ...
## $ glu <int> 86, 195, 77, 165, 107, 97, 83, 193, 142, 128, 137, 154, ...
## $ bp <int> 68, 70, 82, 76, 60, 76, 58, 50, 80, 78, 40, 78, 60, 62, ...
## $ skin <int> 28, 33, 41, 43, 25, 27, 31, 16, 15, 37, 35, 30, 23, 7, 5...
## $ bmi <dbl> 30.2, 25.1, 35.8, 47.9, 26.4, 35.6, 34.3, 25.9, 32.4, 43...
## $ ped <dbl> 0.364, 0.163, 0.156, 0.259, 0.133, 0.378, 0.336, 0.655, ...
## $ age <int> 24, 55, 35, 26, 23, 52, 25, 24, 63, 31, 33, 45, 59, 44, ...
## $ type <fct> No, Yes, No, No, No, Yes, No, No, No, Yes, Yes, No, Yes,...
summary(Pima.tr2)
## npreg glu bp skin
## Min. : 0.000 Min. : 56.0 Min. : 38.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.:101.0 1st Qu.: 64.00 1st Qu.:21.00
## Median : 3.000 Median :121.0 Median : 72.00 Median :29.00
## Mean : 3.787 Mean :123.7 Mean : 72.32 Mean :29.15
## 3rd Qu.: 6.000 3rd Qu.:142.0 3rd Qu.: 80.00 3rd Qu.:36.00
## Max. :14.000 Max. :199.0 Max. :114.00 Max. :99.00
## NA's :13 NA's :98
## bmi ped age type
## Min. :18.20 Min. :0.0780 Min. :21.0 No :194
## 1st Qu.:27.10 1st Qu.:0.2367 1st Qu.:24.0 Yes:106
## Median :32.00 Median :0.3360 Median :29.0
## Mean :32.05 Mean :0.4357 Mean :33.1
## 3rd Qu.:36.50 3rd Qu.:0.5867 3rd Qu.:40.0
## Max. :52.90 Max. :2.2880 Max. :72.0
## NA's :3
# One histogram per continuous Pima.tr2 measurement, arranged in two rows.
# No bin width is supplied, so every panel uses geom_histogram()'s default bins.
h1 <- ggplot(data = Pima.tr2, mapping = aes(x = glu)) +
  geom_histogram()
h2 <- ggplot(data = Pima.tr2, mapping = aes(x = bp)) +
  geom_histogram()
h3 <- ggplot(data = Pima.tr2, mapping = aes(x = skin)) +
  geom_histogram()
h4 <- ggplot(data = Pima.tr2, mapping = aes(x = bmi)) +
  geom_histogram()
h5 <- ggplot(data = Pima.tr2, mapping = aes(x = ped)) +
  geom_histogram()
h6 <- ggplot(data = Pima.tr2, mapping = aes(x = age)) +
  geom_histogram()
# Six panels in two rows gives a 2 x 3 grid.
grid.arrange(h1, h2, h3, h4, h5, h6, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 13 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 98 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Keep only the columns glu through age.
# select() is namespace-qualified because MASS exports its own select(), which
# masks the dplyr one as soon as MASS is attached in the same session.
PimaV <- dplyr::select(Pima.tr2, glu:age)
# Set the plot margins (bottom, left, top, right) for the base-graphics plot.
par(mar = c(3.1, 4.1, 1.1, 2.1))
# scale() standardises every column so the boxes share a single axis.
# Outliers are drawn as solid dots (pch = 16) in red.
boxplot(scale(PimaV), pch = 16, outcol = "red")
# Pairs plot of the same columns with density curves on the diagonal.
ggpairs(data = PimaV, diag = list(continuous = "densityDiag"),
        axisLabels = "show")
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 13 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 98 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 3 rows containing missing values
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 99 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 16 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 13 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 13 rows containing missing values
## Warning: Removed 98 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 99 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 98 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 98 rows containing missing values
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 99 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_density).
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 3 rows containing missing values
## Warning in (function (data, mapping, alignPercent = 0.6, method =
## "pearson", : Removed 3 rows containing missing values
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 98 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing missing values (geom_point).
La primera gráfica que se muestra es simplemente para ver las distribuciones de las variables y su sesgo.
La segunda gráfica es para ver la injerencia que cada una de las variables tiene sobre la variable de respuesta “type”.
En la segunda gráfica se puede observar que, por sí solas, las variables no influyen directamente en el resultado del paciente en cuanto a diabetes; todas las medianas se encuentran muy cercanas al cero.
La última gráfica sirve para ver la relación entre las variables independientes (la correlación en la diagonal superior, en la diagonal su distribución y en la diagonal inferior cómo se comportan los datos).
Estas gráficas no sirven para dar información sobre el efecto que las variables independientes tienen sobre la variable objetivo.
# GDAdata is attached here, but the btw2009 data set itself is loaded from the
# flexclust package; class() then confirms what kind of object it is.
library(GDAdata)
data(btw2009, package = "flexclust")
class(btw2009)
## [1] "data.frame"
str(btw2009)
## 'data.frame': 299 obs. of 17 variables:
## $ state : Factor w/ 16 levels "Baden-Wuerttemberg",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ eligible: int 225216 187047 177604 198910 200347 175132 232899 240010 179987 237144 ...
## $ votes : int 163329 134671 128320 150018 145483 127971 176188 180295 131917 181568 ...
## $ invalid1: int 4117 3382 3348 3404 3442 3292 3384 3674 4043 3884 ...
## $ invalid2: int 4181 3218 3030 3026 2263 3031 3193 3303 3094 3259 ...
## $ valid1 : int 159212 131289 124972 146614 142041 124679 172804 176621 127874 177684 ...
## $ valid2 : int 159148 131453 125290 146992 143220 124940 172995 176992 128823 178309 ...
## $ SPD1 : int 52139 36642 37277 47610 54398 41683 54950 54065 43765 53073 ...
## $ SPD2 : int 41793 32239 31282 39253 42369 35557 45540 44445 36329 45354 ...
## $ UNION1 : int 61793 56751 49779 58876 42733 48136 70458 70290 49363 70874 ...
## $ UNION2 : int 51068 47947 43480 49789 36397 40110 57203 58225 42193 59075 ...
## $ GRUENE1 : int 16399 12840 10665 14790 18699 12187 16143 16633 10411 18761 ...
## $ GRUENE2 : int 21967 14927 12899 18751 24659 15701 20978 21245 13651 22240 ...
## $ FDP1 : int 15292 15184 16037 14317 12188 12515 17018 20084 14397 19742 ...
## $ FDP2 : int 24187 22605 21970 23442 19156 19772 28640 31407 22701 30714 ...
## $ LINKE1 : int 11918 8208 8896 8729 11817 7871 11794 12527 8235 12351 ...
## $ LINKE2 : int 13481 8836 9808 9821 13430 8936 13412 14280 9342 13855 ...
glimpse(btw2009)
## Observations: 299
## Variables: 17
## $ state <fct> Schleswig-Holstein, Schleswig-Holstein, Schleswig-Hol...
## $ eligible <int> 225216, 187047, 177604, 198910, 200347, 175132, 23289...
## $ votes <int> 163329, 134671, 128320, 150018, 145483, 127971, 17618...
## $ invalid1 <int> 4117, 3382, 3348, 3404, 3442, 3292, 3384, 3674, 4043,...
## $ invalid2 <int> 4181, 3218, 3030, 3026, 2263, 3031, 3193, 3303, 3094,...
## $ valid1 <int> 159212, 131289, 124972, 146614, 142041, 124679, 17280...
## $ valid2 <int> 159148, 131453, 125290, 146992, 143220, 124940, 17299...
## $ SPD1 <int> 52139, 36642, 37277, 47610, 54398, 41683, 54950, 5406...
## $ SPD2 <int> 41793, 32239, 31282, 39253, 42369, 35557, 45540, 4444...
## $ UNION1 <int> 61793, 56751, 49779, 58876, 42733, 48136, 70458, 7029...
## $ UNION2 <int> 51068, 47947, 43480, 49789, 36397, 40110, 57203, 5822...
## $ GRUENE1 <int> 16399, 12840, 10665, 14790, 18699, 12187, 16143, 1663...
## $ GRUENE2 <int> 21967, 14927, 12899, 18751, 24659, 15701, 20978, 2124...
## $ FDP1 <int> 15292, 15184, 16037, 14317, 12188, 12515, 17018, 2008...
## $ FDP2 <int> 24187, 22605, 21970, 23442, 19156, 19772, 28640, 3140...
## $ LINKE1 <int> 11918, 8208, 8896, 8729, 11817, 7871, 11794, 12527, 8...
## $ LINKE2 <int> 13481, 8836, 9808, 9821, 13430, 8936, 13412, 14280, 9...
summary(btw2009)
## state eligible votes invalid1
## Nordrhein-Westfalen:64 Min. :154767 Min. : 94922 Min. :1198
## Bayern :45 1st Qu.:190016 1st Qu.:133038 1st Qu.:1888
## Baden-Wuerttemberg :38 Median :207451 Median :146160 Median :2374
## Niedersachsen :30 Mean :207921 Mean :147176 Mean :2534
## Hessen :21 3rd Qu.:224707 3rd Qu.:160939 3rd Qu.:3098
## Sachsen :16 Max. :256131 Max. :191097 Max. :5598
## (Other) :85
## invalid2 valid1 valid2 SPD1
## Min. : 974 Min. : 93398 Min. : 93873 Min. :14040
## 1st Qu.:1552 1st Qu.:130464 1st Qu.:130780 1st Qu.:28807
## Median :1971 Median :143231 Median :143546 Median :40464
## Mean :2122 Mean :144642 Mean :145054 Mean :40401
## 3rd Qu.:2564 3rd Qu.:158170 3rd Qu.:158481 3rd Qu.:50988
## Max. :4959 Max. :188127 Max. :188728 Max. :73215
##
## SPD2 UNION1 UNION2 GRUENE1
## Min. :13611 Min. : 18394 Min. :18788 Min. : 3725
## 1st Qu.:25481 1st Qu.: 46464 1st Qu.:40868 1st Qu.: 9008
## Median :32691 Median : 56503 Median :49267 Median :12190
## Mean :33413 Mean : 57016 Mean :49025 Mean :13436
## 3rd Qu.:40946 3rd Qu.: 66090 3rd Qu.:55608 3rd Qu.:16142
## Max. :59493 Max. :101261 Max. :86686 Max. :73897
## NA's :3
## GRUENE2 FDP1 FDP2 LINKE1
## Min. : 4238 Min. : 4885 Min. : 7872 Min. : 4868
## 1st Qu.:10906 1st Qu.:10665 1st Qu.:16805 1st Qu.: 9066
## Median :14609 Median :12982 Median :20829 Median :10936
## Mean :15529 Mean :13634 Mean :21124 Mean :16132
## 3rd Qu.:18894 3rd Qu.:15970 3rd Qu.:25014 3rd Qu.:14083
## Max. :43347 Max. :31606 Max. :38125 Max. :62880
## NA's :2
## LINKE2
## Min. : 5641
## 1st Qu.:10406
## Median :12596
## Mean :17244
## 3rd Qu.:16266
## Max. :56938
##
# Add Linke2: the LINKE2 count expressed as a percentage of valid2.
btw2009$Linke2 <- with(btw2009, 100 * LINKE2 / valid2)
# Histogram of that percentage with bins one percentage point wide.
ggplot(data = btw2009, mapping = aes(x = Linke2)) +
  geom_histogram(binwidth = 1, fill = "mediumpurple") +
  xlab("Percentage voter support for Die Linke in 2009") +
  ylab("")
Información general
¿de qué se trata?
Die Linke es un partido político alemán, los datos muestran el porcentaje de apoyo a este partido político en 2009, representado por el número de votos.
tipo de cada variable
Las variables son el número de votos a cada partido político por estado
¿qué se puede apreciar con cada gráfica?
No está muy clara la gráfica, el eje de las x no da mucha información
¿cuál crees que sea su objetivo?
Ver el porcentaje de aprobación del partido por estado-ciudad.
Da tus conclusiones sobre lo que se percibe con cada gráfica
Se puede apreciar que existen dos distribuciones de los datos
library(UsingR)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
# Load the galton data set from UsingR and check what kind of object it is.
data(galton, package="UsingR")
class(galton)
## [1] "data.frame"
str(galton)
## 'data.frame': 928 obs. of 2 variables:
## $ child : num 61.7 61.7 61.7 61.7 61.7 62.2 62.2 62.2 62.2 62.2 ...
## $ parent: num 70.5 68.5 65.5 64.5 64 67.5 67.5 67.5 66.5 66.5 ...
glimpse(galton)
## Observations: 928
## Variables: 2
## $ child <dbl> 61.7, 61.7, 61.7, 61.7, 61.7, 62.2, 62.2, 62.2, 62.2, 6...
## $ parent <dbl> 70.5, 68.5, 65.5, 64.5, 64.0, 67.5, 67.5, 67.5, 66.5, 6...
summary(galton)
## child parent
## Min. :61.70 Min. :64.00
## 1st Qu.:66.20 1st Qu.:67.50
## Median :68.20 Median :68.50
## Mean :68.09 Mean :68.31
## 3rd Qu.:70.20 3rd Qu.:69.50
## Max. :73.70 Max. :73.00
# Shared x-axis label for the two base-graphics histograms.
ht <- "height (in)"

# First figure: default hist() for children and parents, side by side.
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
hist(galton$child, main = "Children", xlab = ht, col = "green")
hist(galton$parent, main = "Parents", xlab = ht, col = "blue")

# Second figure: the same two variables through MASS::truehist() with h = 0.1.
# with() is kept so truehist() labels the axis with the bare column name.
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
with(galton, MASS::truehist(child, h = 0.1))
with(galton, MASS::truehist(parent, h = 0.1))
# Histograms of children's and parents' heights (bin width 1) on identical
# axes, each with a red vertical line at the median, stacked by grid.arrange().
c1 <- ggplot(data = galton, mapping = aes(x = child)) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = median(galton$child), colour = "red") +
  xlim(60, 75) +
  ylim(0, 225) +
  ylab("")
p1 <- ggplot(data = galton, mapping = aes(x = parent)) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = median(galton$parent), colour = "red") +
  xlim(60, 75) +
  ylim(0, 225) +
  ylab("")
grid.arrange(c1, p1)
Información general
¿de qué se trata?
Tabulated data set used by Galton in 1885 to study the relationship between a parent’s height and their children’s.
tipo de cada variable
Las variables son continuas y se refieren a la altura del padre y del hijo
¿qué se puede apreciar con cada gráfica?
En la primera gráfica se aprecian dos histogramas, tanto de la altura del padre como de la altura del hijo. También en la segunda, pero con distintos parámetros gráficos.
¿cuál crees que sea su objetivo?
Ver la diferencia en altura entre los padres y los hijos.
Da tus conclusiones sobre lo que se percibe con cada gráfica
Las gráficas parecen mostrar que no existe diferencia entre la altura promedio de los hijos y de los padres
# Load the father.son data set from UsingR and check what kind of object it is.
data(father.son, package="UsingR")
class(father.son)
## [1] "data.frame"
str(father.son)
## 'data.frame': 1078 obs. of 2 variables:
## $ fheight: num 65 63.3 65 65.8 61.1 ...
## $ sheight: num 59.8 63.2 63.3 62.8 64.3 ...
glimpse(father.son)
## Observations: 1,078
## Variables: 2
## $ fheight <dbl> 65.04851, 63.25094, 64.95532, 65.75250, 61.13723, 63.0...
## $ sheight <dbl> 59.77827, 63.21404, 63.34242, 62.79238, 64.28113, 64.2...
summary(father.son)
## fheight sheight
## Min. :59.01 Min. :58.51
## 1st Qu.:65.79 1st Qu.:66.93
## Median :67.77 Median :68.62
## Mean :67.69 Mean :68.68
## 3rd Qu.:69.60 3rd Qu.:70.47
## Max. :75.43 Max. :78.36
# Density-scaled histograms (bin width 1) with a kernel density curve on top.
# Both panels use the same axis limits so sons and fathers can be compared
# side by side.
c2 <- ggplot(data = father.son, mapping = aes(x = sheight)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 1) +
  geom_density() +
  ggtitle("Sons") +
  xlab("ht (inches)") +
  ylab("") +
  xlim(58, 80) +
  ylim(0, 0.16)
p2 <- ggplot(data = father.son, mapping = aes(x = fheight)) +
  geom_histogram(mapping = aes(y = ..density..), binwidth = 1) +
  geom_density() +
  ggtitle("Fathers") +
  xlab("ht (inches)") +
  ylab("") +
  xlim(58, 80) +
  ylim(0, 0.16)
grid.arrange(c2, p2, nrow = 1)
# Normal Q-Q plots for sons and fathers, side by side on a common y range,
# each with its reference line from qqline().
par(mfrow = c(1, 2), las = 1, mar = c(3.1, 4.1, 1.1, 2.1))
qqnorm(father.son$sheight, main = "Sons", xlab = "", ylab = "",
       pch = 16, ylim = c(55, 80))
qqline(father.son$sheight)
qqnorm(father.son$fheight, main = "Fathers", xlab = "", ylab = "",
       pch = 16, ylim = c(55, 80))
qqline(father.son$fheight)
# Shapiro-Wilk normality test on the fathers' heights.
shapiro.test(father.son$fheight)
##
## Shapiro-Wilk normality test
##
## data: father.son$fheight
## W = 0.99791, p-value = 0.1962
# Shapiro-Wilk normality test on the sons' heights.
shapiro.test(father.son$sheight)
##
## Shapiro-Wilk normality test
##
## data: father.son$sheight
## W = 0.99642, p-value = 0.0142
Información general
¿de qué se trata?
Measurements of a father’s height and his son’s height
tipo de cada variable
Las variables son continuas y se refieren a la altura del padre y del hijo
¿qué se puede apreciar con cada gráfica?
En la primera gráfica se aprecian dos histogramas, tanto de la altura del padre como de la altura del hijo. En la segunda gráfica se aprecia una gráfica de ajuste a una distribución normal y se percibe que ambas variables se ajustan muy bien a esta distribución.
¿cuál crees que sea su objetivo?
Ver la diferencia en altura entre los padres y los hijos y probar el ajuste de ambas distribuciones a la distribución normal.
Da tus conclusiones sobre lo que se percibe con cada gráfica
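Las gráficas parecen mostrar que ambas variables se distribuyen de forma aproximadamente normal. La prueba de Shapiro-Wilk lo respalda para la altura de los padres (p = 0.1962), pero rechaza la normalidad al 5 % para la altura de los hijos (p = 0.0142).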
class(MASS::hills)
## [1] "data.frame"
str(MASS::hills)
## 'data.frame': 35 obs. of 3 variables:
## $ dist : num 2.5 6 6 7.5 8 8 16 6 5 6 ...
## $ climb: int 650 2500 900 800 3070 2866 7500 800 800 650 ...
## $ time : num 16.1 48.4 33.6 45.6 62.3 ...
glimpse(MASS::hills)
## Observations: 35
## Variables: 3
## $ dist <dbl> 2.5, 6.0, 6.0, 7.5, 8.0, 8.0, 16.0, 6.0, 5.0, 6.0, 28.0,...
## $ climb <int> 650, 2500, 900, 800, 3070, 2866, 7500, 800, 800, 650, 21...
## $ time <dbl> 16.083, 48.350, 33.650, 45.600, 62.267, 73.217, 204.617,...
summary(MASS::hills)
## dist climb time
## Min. : 2.000 Min. : 300 Min. : 15.95
## 1st Qu.: 4.500 1st Qu.: 725 1st Qu.: 28.00
## Median : 6.000 Median :1000 Median : 39.75
## Mean : 7.529 Mean :1815 Mean : 57.88
## 3rd Qu.: 8.000 3rd Qu.:2200 3rd Qu.: 68.62
## Max. :28.000 Max. :7500 Max. :204.62
# Four views of the time column of MASS::hills, one plot per figure.
par(mfrow = c(1, 1), mar = c(3.1, 4.1, 1.1, 2.1))
# Horizontal boxplot; outliers are drawn as solid dots.
with(MASS::hills, boxplot(time, pch = 16, horizontal = TRUE, ylim = c(0, 220)))
# truehist() and hist() with their default binning.
with(MASS::hills, MASS::truehist(time))
with(MASS::hills, hist(time))
# ggplot2 histogram with bins of width 10.
ggplot(data = MASS::hills, mapping = aes(x = time)) +
  geom_histogram(binwidth = 10)
Información general
¿de qué se trata?
The record times in 1984 for 35 Scottish hill races.
tipo de cada variable
dist: distance in miles (continua)
climb: total height gained during the route, in feet (discreta).
time: record time in minutes (continua)
¿qué se puede apreciar con cada gráfica?
En la primera gráfica de boxplot se puede apreciar que los datos se concentran a la izquierda (su mediana está a la izquierda), es decir, la distribución tiene sesgo positivo, con una cola larga hacia la derecha. Existen cuatro puntos “outliers” muy extraños del tiempo que vale la pena observar.
¿cuál crees que sea su objetivo?
No lo tengo muy claro, supongo que llevar un estadístico de este deporte.
Da tus conclusiones sobre lo que se percibe con cada gráfica
Únicamente se puede concluir que los datos se concentran a la izquierda, con una cola hacia la derecha (sesgo positivo). Es extraño este sesgo.
# Boston Housing: the Boston data set is referenced directly from the MASS
# namespace; class() confirms what kind of object it is.
class(MASS::Boston)
## [1] "data.frame"
str(MASS::Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
glimpse(MASS::Boston)
## Observations: 506
## Variables: 14
## $ crim <dbl> 0.00632, 0.02731, 0.02729, 0.03237, 0.06905, 0.02985, ...
## $ zn <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5,...
## $ indus <dbl> 2.31, 7.07, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, ...
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ nox <dbl> 0.538, 0.469, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524...
## $ rm <dbl> 6.575, 6.421, 7.185, 6.998, 7.147, 6.430, 6.012, 6.172...
## $ age <dbl> 65.2, 78.9, 61.1, 45.8, 54.2, 58.7, 66.6, 96.1, 100.0,...
## $ dis <dbl> 4.0900, 4.9671, 4.9671, 6.0622, 6.0622, 6.0622, 5.5605...
## $ rad <int> 1, 2, 2, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, ...
## $ tax <dbl> 296, 242, 242, 222, 222, 222, 311, 311, 311, 311, 311,...
## $ ptratio <dbl> 15.3, 17.8, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, ...
## $ black <dbl> 396.90, 396.90, 392.83, 394.63, 396.90, 394.12, 395.60...
## $ lstat <dbl> 4.98, 9.14, 4.03, 2.94, 5.33, 5.21, 12.43, 19.15, 29.9...
## $ medv <dbl> 24.0, 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, ...
summary(MASS::Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Histogram of medv with the default binning and a descriptive x-axis label.
ggplot(data = MASS::Boston, mapping = aes(x = medv)) +
  geom_histogram() +
  xlab("Median housing value (thousands of dollars)") +
  ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# MASS is attached so truehist() can be called without a namespace prefix.
library(MASS)
# truehist() of medv with h = 1.
with(MASS::Boston, truehist(medv, h = 1))
library(tidyr)
# Stack the columns crim through medv into name/value pairs (BosVars and
# BosValues) so every variable can get its own facet.
B2 <- gather(MASS::Boston, key = BosVars, value = BosValues, crim:medv)
# One default-binned histogram per variable; scales = "free" lets each facet
# choose its own axis ranges.
ggplot(data = B2, mapping = aes(x = BosValues)) +
  geom_histogram() +
  facet_wrap(~ BosVars, scales = "free") +
  xlab("") +
  ylab("")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Attach MASS so Boston and truehist() can be used unqualified below.
library(MASS)
# The ptratio column drawn with base hist() and with truehist().
with(Boston, hist(ptratio))
with(Boston, truehist(ptratio))
# medv shown three further ways: a boxplot (outliers as solid dots),
# a jittered strip chart, and a stem-and-leaf display printed to the console.
boxplot(Boston$medv, pch=16)
stripchart(Boston$medv, method="jitter", pch=16)
stem(Boston$medv)
##
## The decimal point is at the |
##
## 4 | 006
## 6 | 30022245
## 8 | 1334455788567
## 10 | 2224455899035778899
## 12 | 013567778011112333444455668888899
## 14 | 0111233445556689990001222344666667
## 16 | 01112234556677880111222344455567888889
## 18 | 01222334445555667778899990011112233333444444555566666778889999
## 20 | 0000011111223333444455566666677888990001122222444445566777777788999
## 22 | 00000001222223344555666667788889999000011111112222333344566777788889
## 24 | 001112333444455566777888800000000123
## 26 | 24456667011555599
## 28 | 01244567770011466889
## 30 | 111357801255667
## 32 | 0024579011223448
## 34 | 679991244
## 36 | 01224502369
## 38 | 78
## 40 | 37
## 42 | 38158
## 44 | 084
## 46 | 07
## 48 | 358
## 50 | 0000000000000000
# bin1() bins medv into 50 bins, ash1() processes the binned counts, and the
# result is drawn as a line.
# NOTE(review): presumably an averaged shifted histogram density estimate -
# confirm against ?ash1.
library(ash)
plot(ash1(bin1(Boston$medv, nbin=50)), type="l")
## [1] "ash estimate nonzero outside interval ab"
# Kernel density estimate of medv with the default bandwidth, plus a rug of
# the raw observations along the axis.
d1 <- density(Boston$medv)
plot(d1, ylim = c(0, 0.08))
rug(Boston$medv)
# Overlay estimates at one half (green) and one fifth (blue) of that bandwidth.
lines(density(Boston$medv, bw = d1$bw / 2), col = "green")
lines(density(Boston$medv, bw = d1$bw / 5), col = "blue")
# Reload a fresh copy of btw2009 from flexclust, replacing the copy in the workspace.
data(btw2009, package = "flexclust")
# Duplicate the state factor under the name Bundesland.
btw2009$Bundesland <- btw2009$state
# List the state names in factor-level order.
levels(btw2009$state)
## [1] "Baden-Wuerttemberg" "Bayern"
## [3] "Berlin" "Brandenburg"
## [5] "Bremen" "Hamburg"
## [7] "Hessen" "Mecklenburg-Vorpommern"
## [9] "Niedersachsen" "Nordrhein-Westfalen"
## [11] "Rheinland-Pfalz" "Saarland"
## [13] "Sachsen" "Sachsen-Anhalt"
## [15] "Schleswig-Holstein" "Thueringen"
class(btw2009)
## [1] "data.frame"
str(btw2009)
## 'data.frame': 299 obs. of 18 variables:
## $ state : Factor w/ 16 levels "Baden-Wuerttemberg",..: 15 15 15 15 15 15 15 15 15 15 ...
## $ eligible : int 225216 187047 177604 198910 200347 175132 232899 240010 179987 237144 ...
## $ votes : int 163329 134671 128320 150018 145483 127971 176188 180295 131917 181568 ...
## $ invalid1 : int 4117 3382 3348 3404 3442 3292 3384 3674 4043 3884 ...
## $ invalid2 : int 4181 3218 3030 3026 2263 3031 3193 3303 3094 3259 ...
## $ valid1 : int 159212 131289 124972 146614 142041 124679 172804 176621 127874 177684 ...
## $ valid2 : int 159148 131453 125290 146992 143220 124940 172995 176992 128823 178309 ...
## $ SPD1 : int 52139 36642 37277 47610 54398 41683 54950 54065 43765 53073 ...
## $ SPD2 : int 41793 32239 31282 39253 42369 35557 45540 44445 36329 45354 ...
## $ UNION1 : int 61793 56751 49779 58876 42733 48136 70458 70290 49363 70874 ...
## $ UNION2 : int 51068 47947 43480 49789 36397 40110 57203 58225 42193 59075 ...
## $ GRUENE1 : int 16399 12840 10665 14790 18699 12187 16143 16633 10411 18761 ...
## $ GRUENE2 : int 21967 14927 12899 18751 24659 15701 20978 21245 13651 22240 ...
## $ FDP1 : int 15292 15184 16037 14317 12188 12515 17018 20084 14397 19742 ...
## $ FDP2 : int 24187 22605 21970 23442 19156 19772 28640 31407 22701 30714 ...
## $ LINKE1 : int 11918 8208 8896 8729 11817 7871 11794 12527 8235 12351 ...
## $ LINKE2 : int 13481 8836 9808 9821 13430 8936 13412 14280 9342 13855 ...
## $ Bundesland: Factor w/ 16 levels "Baden-Wuerttemberg",..: 15 15 15 15 15 15 15 15 15 15 ...
glimpse(btw2009)
## Observations: 299
## Variables: 18
## $ state <fct> Schleswig-Holstein, Schleswig-Holstein, Schleswig-H...
## $ eligible <int> 225216, 187047, 177604, 198910, 200347, 175132, 232...
## $ votes <int> 163329, 134671, 128320, 150018, 145483, 127971, 176...
## $ invalid1 <int> 4117, 3382, 3348, 3404, 3442, 3292, 3384, 3674, 404...
## $ invalid2 <int> 4181, 3218, 3030, 3026, 2263, 3031, 3193, 3303, 309...
## $ valid1 <int> 159212, 131289, 124972, 146614, 142041, 124679, 172...
## $ valid2 <int> 159148, 131453, 125290, 146992, 143220, 124940, 172...
## $ SPD1 <int> 52139, 36642, 37277, 47610, 54398, 41683, 54950, 54...
## $ SPD2 <int> 41793, 32239, 31282, 39253, 42369, 35557, 45540, 44...
## $ UNION1 <int> 61793, 56751, 49779, 58876, 42733, 48136, 70458, 70...
## $ UNION2 <int> 51068, 47947, 43480, 49789, 36397, 40110, 57203, 58...
## $ GRUENE1 <int> 16399, 12840, 10665, 14790, 18699, 12187, 16143, 16...
## $ GRUENE2 <int> 21967, 14927, 12899, 18751, 24659, 15701, 20978, 21...
## $ FDP1 <int> 15292, 15184, 16037, 14317, 12188, 12515, 17018, 20...
## $ FDP2 <int> 24187, 22605, 21970, 23442, 19156, 19772, 28640, 31...
## $ LINKE1 <int> 11918, 8208, 8896, 8729, 11817, 7871, 11794, 12527,...
## $ LINKE2 <int> 13481, 8836, 9808, 9821, 13430, 8936, 13412, 14280,...
## $ Bundesland <fct> Schleswig-Holstein, Schleswig-Holstein, Schleswig-H...
summary(btw2009)
## state eligible votes invalid1
## Nordrhein-Westfalen:64 Min. :154767 Min. : 94922 Min. :1198
## Bayern :45 1st Qu.:190016 1st Qu.:133038 1st Qu.:1888
## Baden-Wuerttemberg :38 Median :207451 Median :146160 Median :2374
## Niedersachsen :30 Mean :207921 Mean :147176 Mean :2534
## Hessen :21 3rd Qu.:224707 3rd Qu.:160939 3rd Qu.:3098
## Sachsen :16 Max. :256131 Max. :191097 Max. :5598
## (Other) :85
## invalid2 valid1 valid2 SPD1
## Min. : 974 Min. : 93398 Min. : 93873 Min. :14040
## 1st Qu.:1552 1st Qu.:130464 1st Qu.:130780 1st Qu.:28807
## Median :1971 Median :143231 Median :143546 Median :40464
## Mean :2122 Mean :144642 Mean :145054 Mean :40401
## 3rd Qu.:2564 3rd Qu.:158170 3rd Qu.:158481 3rd Qu.:50988
## Max. :4959 Max. :188127 Max. :188728 Max. :73215
##
## SPD2 UNION1 UNION2 GRUENE1
## Min. :13611 Min. : 18394 Min. :18788 Min. : 3725
## 1st Qu.:25481 1st Qu.: 46464 1st Qu.:40868 1st Qu.: 9008
## Median :32691 Median : 56503 Median :49267 Median :12190
## Mean :33413 Mean : 57016 Mean :49025 Mean :13436
## 3rd Qu.:40946 3rd Qu.: 66090 3rd Qu.:55608 3rd Qu.:16142
## Max. :59493 Max. :101261 Max. :86686 Max. :73897
## NA's :3
## GRUENE2 FDP1 FDP2 LINKE1
## Min. : 4238 Min. : 4885 Min. : 7872 Min. : 4868
## 1st Qu.:10906 1st Qu.:10665 1st Qu.:16805 1st Qu.: 9066
## Median :14609 Median :12982 Median :20829 Median :10936
## Mean :15529 Mean :13634 Mean :21124 Mean :16132
## 3rd Qu.:18894 3rd Qu.:15970 3rd Qu.:25014 3rd Qu.:14083
## Max. :43347 Max. :31606 Max. :38125 Max. :62880
## NA's :2
## LINKE2 Bundesland
## Min. : 5641 Nordrhein-Westfalen:64
## 1st Qu.:10406 Bayern :45
## Median :12596 Baden-Wuerttemberg :38
## Mean :17244 Niedersachsen :30
## 3rd Qu.:16266 Hessen :21
## Max. :56938 Sachsen :16
## (Other) :85
# stateA is a copy of state whose levels are replaced by two-letter codes.
# The codes are listed in the same order as levels(btw2009$state).
btw2009$stateA <- btw2009$state
levels(btw2009$stateA) <- c("BW", "BY", "BE",
                            "BB", "HB", "HH",
                            "HE", "MV", "NI",
                            "NW", "RP", "SL",
                            "SN", "ST", "SH", "TH")
# Boxplots of LINKE2 per state; box width varies with group size
# (varwidth = TRUE) and the long state names are rotated to stay readable.
ggplot(btw2009, aes(state, LINKE2)) +
  geom_boxplot(varwidth = TRUE) +
  ylab("") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Total of the eligible column per state code. The original assigned the
# result to a throwaway `size` inside with(); that assignment was never used.
Voters <- with(btw2009, tapply(eligible, stateA, sum))
Voters
## BW BY BE BB HB HH HE MV
## 7633818 9382583 2471665 2128715 487978 1256634 4398919 1400298
## NI NW RP SL SN ST SH TH
## 6112110 13288291 3103878 808554 3518195 2028572 2234720 1913559
# Build a per-state summary table from the Voters totals.
Bundesland <- rownames(Voters)
btw9s <- data.frame(Bundesland, Voters)
# Every state starts as "West"; the six listed rows are then set to "East".
# The rows are addressed by row name, i.e. by state code.
btw9s[["EW"]] <- "West"
btw9s[c("BB", "BE", "MV", "SN", "ST", "TH"), "EW"] <- "East"
# State codes ordered by EW first, then by decreasing Voters within each group.
ls <- btw9s$Bundesland[order(btw9s$EW, -btw9s$Voters)]
ls
## [1] SN BE BB ST TH MV NW BY BW NI HE RP SH HH SL HB
## Levels: BB BE BW BY HB HE HH MV NI NW RP SH SL SN ST TH
# State1 holds the state codes as a factor whose level order follows `ls`.
btw9s <- within(btw9s, State1 <- factor(Bundesland, levels = ls))
# Three bar charts of Voters (in millions) that differ only in bar order.
# b1 uses the default order of Bundesland.
b1 <- ggplot(btw9s, aes(Bundesland, Voters / 1000000)) +
  geom_bar(stat = "identity") +
  ylab("Voters (millions)")
# b2 orders the bars by decreasing Voters.
b2 <- ggplot(btw9s, aes(reorder(Bundesland, -Voters), Voters / 1000000)) +
  geom_bar(stat = "identity") +
  xlab("Bundesland") +
  ylab("Voters (millions)")
# b3 uses the State1 level order.
b3 <- ggplot(btw9s, aes(State1, Voters / 1000000)) +
  geom_bar(stat = "identity") +
  xlab("Bundesland") +
  ylab("Voters (millions)")
# The y-axis label was truncated to "Vo (millions)" on b2 and b3; all three
# panels now carry the same label.
grid.arrange(b1, b2, b3)
# Anorexia: load the anorexia data set from MASS (without attaching the
# package) and check what kind of object it is.
data(anorexia, package="MASS")
class(anorexia)
## [1] "data.frame"
str(anorexia)
## 'data.frame': 72 obs. of 3 variables:
## $ Treat : Factor w/ 3 levels "CBT","Cont","FT": 2 2 2 2 2 2 2 2 2 2 ...
## $ Prewt : num 80.7 89.4 91.8 74 78.1 88.3 87.3 75.1 80.6 78.4 ...
## $ Postwt: num 80.2 80.1 86.4 86.3 76.1 78.1 75.1 86.7 73.5 84.6 ...
glimpse(anorexia)
## Observations: 72
## Variables: 3
## $ Treat <fct> Cont, Cont, Cont, Cont, Cont, Cont, Cont, Cont, Cont, C...
## $ Prewt <dbl> 80.7, 89.4, 91.8, 74.0, 78.1, 88.3, 87.3, 75.1, 80.6, 7...
## $ Postwt <dbl> 80.2, 80.1, 86.4, 86.3, 76.1, 78.1, 75.1, 86.7, 73.5, 8...
summary(anorexia)
## Treat Prewt Postwt
## CBT :29 Min. :70.00 Min. : 71.30
## Cont:26 1st Qu.:79.60 1st Qu.: 79.33
## FT :17 Median :82.30 Median : 84.05
## Mean :82.41 Mean : 85.17
## 3rd Qu.:86.00 3rd Qu.: 91.55
## Max. :94.90 Max. :103.60
# Bar chart of the number of rows per Treat level.
ggplot(data = anorexia, mapping = aes(x = Treat)) +
  geom_bar() +
  xlab("Treatment")
# The same counts as a table; with() keeps "Treat" as the printed header.
with(anorexia, table(Treat))
## Treat
## CBT Cont FT
## 29 26 17
class(Titanic)
## [1] "table"
# Convert the Titanic table to a data frame (one row per cell, with the count
# in Freq) and confirm the new class.
Titanic1 <- data.frame(Titanic)
class(Titanic1)
## [1] "data.frame"
str(Titanic1)
## 'data.frame': 32 obs. of 5 variables:
## $ Class : Factor w/ 4 levels "1st","2nd","3rd",..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Sex : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 1 1 ...
## $ Age : Factor w/ 2 levels "Child","Adult": 1 1 1 1 1 1 1 1 2 2 ...
## $ Survived: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Freq : num 0 0 35 0 0 0 17 0 118 154 ...
glimpse(Titanic1)
## Observations: 32
## Variables: 5
## $ Class <fct> 1st, 2nd, 3rd, Crew, 1st, 2nd, 3rd, Crew, 1st, 2nd, 3...
## $ Sex <fct> Male, Male, Male, Male, Female, Female, Female, Femal...
## $ Age <fct> Child, Child, Child, Child, Child, Child, Child, Chil...
## $ Survived <fct> No, No, No, No, No, No, No, No, No, No, No, No, No, N...
## $ Freq <dbl> 0, 0, 35, 0, 0, 0, 17, 0, 118, 154, 387, 670, 4, 13, ...
summary(Titanic1)
## Class Sex Age Survived Freq
## 1st :8 Male :16 Child:16 No :16 Min. : 0.00
## 2nd :8 Female:16 Adult:16 Yes:16 1st Qu.: 0.75
## 3rd :8 Median : 13.50
## Crew:8 Mean : 68.78
## 3rd Qu.: 77.00
## Max. :670.00
# Shared base plot: bars are weighted by Freq so their heights are counts,
# and every panel uses the same y range.
p <- ggplot(data = Titanic1, mapping = aes(weight = Freq)) +
  ylim(0, 2250) +
  ylab("")
# One bar chart per categorical variable, each with its own fill colour.
cs <- p + aes(x = Class) + geom_bar(fill = "blue")
sx <- p + aes(x = Sex) + geom_bar(fill = "green")
ag <- p + aes(x = Age) + geom_bar(fill = "tan2")
su <- p + aes(x = Survived) + geom_bar(fill = "red")
# All four in one row; the Class panel is given more relative width.
grid.arrange(cs, sx, ag, su, nrow = 1, widths = c(3, 2, 2, 2))
# Counts per party; the seventh entry is the "Don’t know" group.
Party <- c("Fine Gael", "La", "Fianna Fail",
           "Sinn Fein", "In", "Green", "Don’t know")
nos <- c(181, 51, 171, 119, 91, 4, 368)
IrOP <- data.frame(Party, nos)
# percnot: share relative to the total without the seventh row.
# percwith: share relative to the total of all seven rows.
# They are added in this order to keep the column layout Party, nos,
# percnot, percwith.
IrOP$percnot <- IrOP$nos / sum(IrOP$nos[-7])
IrOP$percwith <- IrOP$nos / sum(IrOP$nos)
IrOP
## Party nos percnot percwith
## 1 Fine Gael 181 0.293354943 0.183756345
## 2 La 51 0.082658023 0.051776650
## 3 Fianna Fail 171 0.277147488 0.173604061
## 4 Sinn Fein 119 0.192868720 0.120812183
## 5 In 91 0.147487844 0.092385787
## 6 Green 4 0.006482982 0.004060914
## 7 Don’t know 368 0.596434360 0.373604061
# Two pie charts stacked vertically.
par(mfrow = c(2, 1), mar = c(2.1, 2.1, 2.1, 2.1))
# Top: shares of all seven groups, including the seventh ("Don’t know") row.
with(IrOP, pie(percwith, labels = Party, clockwise = TRUE,
               col = c("blue", "red", "darkgreen", "black",
                       "grey", "lightgreen", "white"), radius = 1))
# Bottom: shares with the seventh row removed. The labels are subset the same
# way so there is exactly one label per slice, instead of relying on pie()
# ignoring a surplus label.
with(IrOP, pie(percnot[-7], labels = Party[-7], clockwise = TRUE,
               col = c("blue", "red", "darkgreen", "black",
                       "grey", "lightgreen"), radius = 1))
# Values shown on the pie
# Pie chart of a frequency table; each slice is labelled with the level
# name and, on a second line, its count.
mytable <- table(iris$Species)
lbls <- paste0(names(mytable), "\n", mytable)
# Back to a single plot per page, with the same margins as before.
par(mfrow = c(1, 1), mar = c(2.1, 2.1, 2.1, 2.1))
pie(mytable, labels = lbls,
    main = "Pie Chart of Species\n (with sample sizes)")
# Pie charts are not recommended; see ?pie
data(survey, package="MASS")
class(survey)
## [1] "data.frame"
str(survey)
## 'data.frame': 237 obs. of 12 variables:
## $ Sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
## $ Wr.Hnd: num 18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
## $ NW.Hnd: num 18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
## $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
## $ Fold : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
## $ Pulse : int 92 104 87 NA 35 64 83 74 72 90 ...
## $ Clap : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
## $ Exer : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
## $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ Height: num 173 178 NA 160 165 ...
## $ M.I : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
## $ Age : num 18.2 17.6 16.9 20.3 23.7 ...
glimpse(survey)
## Observations: 237
## Variables: 12
## $ Sex <fct> Female, Male, Male, Male, Male, Female, Male, Female, M...
## $ Wr.Hnd <dbl> 18.5, 19.5, 18.0, 18.8, 20.0, 18.0, 17.7, 17.0, 20.0, 1...
## $ NW.Hnd <dbl> 18.0, 20.5, 13.3, 18.9, 20.0, 17.7, 17.7, 17.3, 19.5, 1...
## $ W.Hnd <fct> Right, Left, Right, Right, Right, Right, Right, Right, ...
## $ Fold <fct> R on L, R on L, L on R, R on L, Neither, L on R, L on R...
## $ Pulse <int> 92, 104, 87, NA, 35, 64, 83, 74, 72, 90, 80, 68, NA, 66...
## $ Clap <fct> Left, Left, Neither, Neither, Right, Right, Right, Righ...
## $ Exer <fct> Some, None, None, None, Some, Some, Freq, Freq, Some, S...
## $ Smoke <fct> Never, Regul, Occas, Never, Never, Never, Never, Never,...
## $ Height <dbl> 173.00, 177.80, NA, 160.00, 165.00, 172.72, 182.88, 157...
## $ M.I <fct> Metric, Imperial, NA, Metric, Metric, Imperial, Imperia...
## $ Age <dbl> 18.250, 17.583, 16.917, 20.333, 23.667, 21.000, 18.833,...
summary(survey)
## Sex Wr.Hnd NW.Hnd W.Hnd Fold
## Female:118 Min. :13.00 Min. :12.50 Left : 18 L on R : 99
## Male :118 1st Qu.:17.50 1st Qu.:17.50 Right:218 Neither: 18
## NA's : 1 Median :18.50 Median :18.50 NA's : 1 R on L :120
## Mean :18.67 Mean :18.58
## 3rd Qu.:19.80 3rd Qu.:19.73
## Max. :23.20 Max. :23.50
## NA's :1 NA's :1
## Pulse Clap Exer Smoke Height
## Min. : 35.00 Left : 39 Freq:115 Heavy: 11 Min. :150.0
## 1st Qu.: 66.00 Neither: 50 None: 24 Never:189 1st Qu.:165.0
## Median : 72.50 Right :147 Some: 98 Occas: 19 Median :171.0
## Mean : 74.15 NA's : 1 Regul: 17 Mean :172.4
## 3rd Qu.: 80.00 NA's : 1 3rd Qu.:180.0
## Max. :104.00 Max. :200.0
## NA's :45 NA's :28
## M.I Age
## Imperial: 68 Min. :16.75
## Metric :141 1st Qu.:17.67
## NA's : 28 Median :18.58
## Mean :20.37
## 3rd Qu.:20.17
## Max. :73.00
##
# Bar charts for the categorical variables in the survey data, one plot
# object per variable, all with a blank y-axis title.
s1 <- ggplot(survey, aes(x = Sex)) +
  geom_bar() +
  labs(y = "")
s2 <- ggplot(survey, aes(x = W.Hnd)) +
  geom_bar() +
  labs(x = "Writing hand", y = "")
s3 <- ggplot(survey, aes(x = Fold)) +
  geom_bar() +
  labs(x = "Folding arms: arm on top", y = "")
s4 <- ggplot(survey, aes(x = Clap)) +
  geom_bar() +
  labs(x = "Clapping: hand on top", y = "")

# ExerN is Exer with its levels put in the order None, Some, Freq.
survey$ExerN <- factor(survey$Exer, levels = c("None", "Some", "Freq"))
s5 <- ggplot(survey, aes(x = ExerN)) +
  geom_bar() +
  labs(x = "Exercise", y = "")
s6 <- ggplot(survey, aes(x = M.I)) +
  geom_bar() +
  labs(x = "Height units", y = "")

# SmokeN is Smoke with its levels in the order Never, Occas, Regul, Heavy.
survey$SmokeN <- factor(
  survey$Smoke,
  levels = c("Never", "Occas", "Regul", "Heavy")
)
s7 <- ggplot(survey, aes(x = SmokeN)) +
  geom_bar() +
  labs(x = "Smoking", y = "")

# First display: all seven panels, three per row.
grid.arrange(s1, s2, s3, s4, s5, s6, s7, ncol = 3)

# Rebuild the Sex panel from the rows where Sex is not missing, then draw
# the seven panels again.
s1 <- ggplot(survey[!is.na(survey$Sex), ], aes(x = Sex)) +
  geom_bar() +
  labs(y = "")
grid.arrange(s1, s2, s3, s4, s5, s6, s7, ncol = 3)

# Counts for the writing-hand variable, including its missing value.
summary(survey[["W.Hnd"]])
## Left Right NA's
## 18 218 1
# Goles en fútbol
data(UKSoccer, package="vcd")
class(UKSoccer)
## [1] "table"
str(UKSoccer)
## table [1:5, 1:5] 27 59 28 19 7 29 53 32 14 8 ...
## - attr(*, "dimnames")=List of 2
## ..$ Home: chr [1:5] "0" "1" "2" "3" ...
## ..$ Away: chr [1:5] "0" "1" "2" "3" ...
glimpse(UKSoccer)
## table [1:5, 1:5] 27 59 28 19 7 29 53 32 14 8 ...
## - attr(*, "dimnames")=List of 2
## ..$ Home: chr [1:5] "0" "1" "2" "3" ...
## ..$ Away: chr [1:5] "0" "1" "2" "3" ...
summary(UKSoccer)
## Number of cases in table: 380
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 18.699, df = 16, p-value = 0.2846
## Chi-squared approximation may be incorrect
# Flatten the Home x Away goals table into a data frame with one row per
# combination and its count in Freq.
PL <- data.frame(UKSoccer)

# Tick labels used on the x-axis of both charts.
lx <- c("0", "1", "2", "3", "4 or more")

# Frequency of home goals; rows are weighted by Freq.
b1 <- ggplot(PL, aes(x = factor(Home), weight = Freq)) +
  geom_bar(fill = "firebrick1") +
  labs(x = "Home Goals", y = "") +
  scale_x_discrete(labels = lx) +
  ylim(0, 150)

# Frequency of away goals, on the same y-range as the home chart.
b2 <- ggplot(PL, aes(x = factor(Away), weight = Freq)) +
  geom_bar(fill = "cyan1") +
  labs(x = "Away Goals", y = "") +
  scale_x_discrete(labels = lx) +
  ylim(0, 150)

# Show the two charts side by side.
grid.arrange(b1, b2, nrow = 1)
# Benford's Law: for each value 1..9 the plotted height is log10(1 + 1/x).
xx <- seq_len(9)
Ben <- data.frame(xx, pdf = log10(1 + 1 / xx))

# Bar chart of those values, with blank axis titles and a fixed y-range.
ggplot(Ben, aes(x = factor(xx), weight = pdf)) +
  geom_bar() +
  labs(x = "", y = "") +
  ylim(0, 0.35)
library(VGAMdata)
data(oly12, package="VGAMdata")
class(oly12)
## [1] "data.frame"
str(oly12)
## 'data.frame': 10384 obs. of 14 variables:
## $ Name : Factor w/ 10366 levels "Aaron Brown",..: 5353 121 4117 16 6033 5686 6061 6765 2738 3854 ...
## $ Country: Factor w/ 205 levels "Afghanistan",..: 144 195 68 125 154 68 8 125 94 3 ...
## $ Age : int 23 33 30 24 26 27 30 23 27 19 ...
## $ Height : num 1.7 1.93 1.87 NA 1.78 1.82 1.82 1.87 1.9 1.7 ...
## $ Weight : int 60 125 76 NA 85 80 73 75 80 NA ...
## $ Sex : Factor w/ 2 levels "F","M": 2 2 2 2 1 2 1 2 2 2 ...
## $ DOB : Date, format: "1989-02-06" NA ...
## $ PlaceOB: Factor w/ 4108 levels "","Aachen (GER)",..: 2486 3302 398 48 3436 1 1 1 1172 2266 ...
## $ Gold : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Silver : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Bronze : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Total : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Sport : Factor w/ 42 levels "Archery","Athletics",..: 27 2 2 7 2 25 29 22 2 7 ...
## $ Event : Factor w/ 763 levels "Group All-Around",..: 350 405 251 443 699 406 726 403 248 491 ...
glimpse(oly12)
## Observations: 10,384
## Variables: 14
## $ Name <fct> Lamusi A, A G Kruger, Jamale Aarrass, Abdelhak Aatakni...
## $ Country <fct> People's Republic of China, United States of America, ...
## $ Age <int> 23, 33, 30, 24, 26, 27, 30, 23, 27, 19, 37, 28, 28, 28...
## $ Height <dbl> 1.70, 1.93, 1.87, NA, 1.78, 1.82, 1.82, 1.87, 1.90, 1....
## $ Weight <int> 60, 125, 76, NA, 85, 80, 73, 75, 80, NA, NA, NA, 60, 6...
## $ Sex <fct> M, M, M, M, F, M, F, M, M, M, M, M, F, F, M, F, M, M, ...
## $ DOB <date> 1989-02-06, NA, NA, 1988-09-02, NA, 1984-06-09, NA, 1...
## $ PlaceOB <fct> NEIMONGGOL (CHN), Sheldon (USA), BEZONS (FRA), AIN SEB...
## $ Gold <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Silver <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Bronze <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Total <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ Sport <fct> Judo, Athletics, Athletics, Boxing, Athletics, Handbal...
## $ Event <fct> Men's -60kg, Men's Hammer Throw, Men's 1500m, Men's Li...
summary(oly12)
## Name Country
## Lei Zhang : 3 Great Britain : 523
## Ling Li : 3 United States of America : 518
## Aleksandar Aleksandrov: 2 Russian Federation : 414
## Hao Wang : 2 Australia : 399
## Ivan Zaytsev : 2 Germany : 385
## Maria Gromova : 2 People's Republic of China: 337
## (Other) :10370 (Other) :7808
## Age Height Weight Sex
## Min. :13.00 Min. :1.320 Min. : 36.00 F:4628
## 1st Qu.:22.00 1st Qu.:1.690 1st Qu.: 61.00 M:5756
## Median :25.00 Median :1.770 Median : 70.00
## Mean :26.07 Mean :1.769 Mean : 72.85
## 3rd Qu.:29.00 3rd Qu.:1.850 3rd Qu.: 81.00
## Max. :71.00 Max. :2.210 Max. :218.00
## NA's :561 NA's :1280
## DOB PlaceOB Gold
## Min. :1947-06-01 :2690 Min. :0.00000
## 1st Qu.:1983-01-02 Seoul (KOR) : 57 1st Qu.:0.00000
## Median :1986-09-11 Budapest (HUN) : 54 Median :0.00000
## Mean :1986-01-09 Moscow (RUS) : 50 Mean :0.01666
## 3rd Qu.:1989-11-10 Shandong (CHN) : 37 3rd Qu.:0.00000
## Max. :1997-07-09 Buenos Aires (ARG): 31 Max. :2.00000
## NA's :6192 (Other) :7465
## Silver Bronze Total Sport
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Athletics:2119
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 Swimming : 907
## Median :0.00000 Median :0.00000 Median :0.0000 Football : 596
## Mean :0.01705 Mean :0.01849 Mean :0.0522 Rowing : 524
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 Hockey : 416
## Max. :2.00000 Max. :2.00000 Max. :5.0000 Judo : 368
## (Other) :5454
## Event
## Men's Football : 336
## Women's Football: 260
## Women's Hockey : 210
## Men's Hockey : 206
## Men's Handball : 160
## Women's Handball: 159
## (Other) :9053
# Scatterplot of Weight against Height for every row of oly12.
ggplot(data = oly12, mapping = aes(x = Height, y = Weight)) +
  geom_point() +
  ggtitle("Athletes at the London Olympics 2012")
## Warning: Removed 1346 rows containing missing values (geom_point).
library(HistData)
data(DrinksWages, package="HistData")
class(DrinksWages)
## [1] "data.frame"
str(DrinksWages)
## 'data.frame': 70 obs. of 6 variables:
## $ class : Factor w/ 3 levels "A","B","C": 1 1 1 1 1 1 1 1 1 1 ...
## $ trade : Factor w/ 70 levels "baker","barman",..: 38 10 25 55 36 44 68 34 14 11 ...
## $ sober : int 1 1 2 1 2 9 8 3 0 12 ...
## $ drinks: int 1 10 1 5 0 8 2 5 7 23 ...
## $ wage : num 24 18.4 21.5 21.2 19 ...
## $ n : int 2 11 3 6 2 17 10 8 7 35 ...
glimpse(DrinksWages)
## Observations: 70
## Variables: 6
## $ class <fct> A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A...
## $ trade <fct> papercutter, cabmen, goldbeater, stablemen, millworker,...
## $ sober <int> 1, 1, 2, 1, 2, 9, 8, 3, 0, 12, 1, 2, 11, 2, 1, 1, 1, 1,...
## $ drinks <int> 1, 10, 1, 5, 0, 8, 2, 5, 7, 23, 0, 4, 9, 2, 1, 2, 0, 0,...
## $ wage <dbl> 24.00000, 18.41667, 21.50000, 21.16667, 19.00000, 20.50...
## $ n <int> 2, 11, 3, 6, 2, 17, 10, 8, 7, 35, 1, 6, 20, 4, 2, 3, 1,...
summary(DrinksWages)
## class trade sober drinks wage
## A:25 baker : 1 Min. : 0.00 Min. : 0.000 Min. :12.00
## B:22 barman : 1 1st Qu.: 1.00 1st Qu.: 0.250 1st Qu.:22.83
## C:23 billposter: 1 Median : 2.00 Median : 2.000 Median :27.00
## blacksmith: 1 Mean : 3.90 Mean : 4.729 Mean :27.03
## bookbinder: 1 3rd Qu.: 3.75 3rd Qu.: 5.000 3rd Qu.:31.08
## bottler : 1 Max. :71.00 Max. :85.000 Max. :40.00
## (Other) :64
## n
## Min. : 1.000
## 1st Qu.: 2.000
## Median : 3.500
## Mean : 8.629
## 3rd Qu.: 8.000
## Max. :156.000
##
# Wage against the share of drinkers (drinks / n) in each row, with both
# axes fixed to start at zero.
ggplot(DrinksWages, aes(x = drinks / n, y = wage)) +
  geom_point() +
  labs(x = "Proportion of drinkers") +
  xlim(0, 1) +
  ylim(0, 40)

# Distribution of the group sizes n: a histogram with one bin per integer,
# then the exact counts.
with(DrinksWages, hist(n, breaks = 0:max(n)))
with(DrinksWages, table(n))
## n
## 1 2 3 4 5 6 7 8 9 10 11 12 17 19 20 21 22 29
## 16 10 9 6 2 4 2 4 4 1 1 2 1 1 1 1 1 1
## 35 37 156
## 1 1 1
# Largest group size among the rows with no drinkers, and the trade(s)
# having that size.
max(DrinksWages$n[DrinksWages$drinks == 0])
## [1] 5
DrinksWages$trade[
  DrinksWages$drinks == 0 &
    DrinksWages$n == max(DrinksWages$n[DrinksWages$drinks == 0])
]
## [1] gasworker
## 70 Levels: baker barman billposter blacksmith bookbinder ... wireworker
# The same two questions for the rows where the sober count is zero.
max(DrinksWages$n[DrinksWages$sober == 0])
## [1] 7
DrinksWages$trade[
  DrinksWages$sober == 0 &
    DrinksWages$n == max(DrinksWages$n[DrinksWages$sober == 0])
]
## [1] chimneysweep
## 70 Levels: baker barman billposter blacksmith bookbinder ... wireworker
# Keep only the rows with n > 4 and redraw the wage / proportion-of-drinkers
# scatterplot for that subset.
# filter() is namespace-qualified because stats::filter() has the same name
# and is the one R finds when dplyr is not attached or has been masked.
bigDW <- dplyr::filter(DrinksWages, n > 4)
ggplot(bigDW, aes(x = drinks / n, y = wage)) +
  geom_point() +
  xlab("Proportion of drinkers") +
  xlim(0, 1) +
  ylim(0, 40)
data(geyser, package="MASS")
class(geyser)
## [1] "data.frame"
str(geyser)
## 'data.frame': 299 obs. of 2 variables:
## $ waiting : num 80 71 57 80 75 77 60 86 77 56 ...
## $ duration: num 4.02 2.15 4 4 4 ...
glimpse(geyser)
## Observations: 299
## Variables: 2
## $ waiting <dbl> 80, 71, 57, 80, 75, 77, 60, 86, 77, 56, 81, 50, 89, 5...
## $ duration <dbl> 4.016667, 2.150000, 4.000000, 4.000000, 4.000000, 2.0...
summary(geyser)
## waiting duration
## Min. : 43.00 Min. :0.8333
## 1st Qu.: 59.00 1st Qu.:2.0000
## Median : 76.00 Median :4.0000
## Mean : 72.31 Mean :3.4608
## 3rd Qu.: 83.00 3rd Qu.:4.3833
## Max. :108.00 Max. :5.4500
# Waiting time against eruption duration: points only.
ggplot(geyser, aes(x = duration, y = waiting)) +
  geom_point()

# The same scatterplot with 2-D density contours drawn over the points.
ggplot(geyser, aes(x = duration, y = waiting)) +
  geom_point() +
  geom_density2d()
library(hdrcde)
## This is hdrcde 3.2
# One plot per page and custom margins for the base-graphics plots below.
par(mfrow = c(1, 1), mar = c(3.1, 4.1, 1.1, 2.1))

# Bivariate HDR boxplot with the points shown, for four probability levels.
with(
  geyser,
  hdr.boxplot.2d(duration, waiting,
                 show.points = TRUE,
                 prob = c(0.01, 0.05, 0.5, 0.75))
)

# The same plot with only the 0.01 level.
with(
  geyser,
  hdr.boxplot.2d(duration, waiting,
                 show.points = TRUE,
                 prob = 0.01)
)
data(Cars93, package="MASS")
class(Cars93)
## [1] "data.frame"
str(Cars93)
## 'data.frame': 93 obs. of 27 variables:
## $ Manufacturer : Factor w/ 32 levels "Acura","Audi",..: 1 1 2 2 3 4 4 4 4 5 ...
## $ Model : Factor w/ 93 levels "100","190E","240",..: 49 56 9 1 6 24 54 74 73 35 ...
## $ Type : Factor w/ 6 levels "Compact","Large",..: 4 3 1 3 3 3 2 2 3 2 ...
## $ Min.Price : num 12.9 29.2 25.9 30.8 23.7 14.2 19.9 22.6 26.3 33 ...
## $ Price : num 15.9 33.9 29.1 37.7 30 15.7 20.8 23.7 26.3 34.7 ...
## $ Max.Price : num 18.8 38.7 32.3 44.6 36.2 17.3 21.7 24.9 26.3 36.3 ...
## $ MPG.city : int 25 18 20 19 22 22 19 16 19 16 ...
## $ MPG.highway : int 31 25 26 26 30 31 28 25 27 25 ...
## $ AirBags : Factor w/ 3 levels "Driver & Passenger",..: 3 1 2 1 2 2 2 2 2 2 ...
## $ DriveTrain : Factor w/ 3 levels "4WD","Front",..: 2 2 2 2 3 2 2 3 2 2 ...
## $ Cylinders : Factor w/ 6 levels "3","4","5","6",..: 2 4 4 4 2 2 4 4 4 5 ...
## $ EngineSize : num 1.8 3.2 2.8 2.8 3.5 2.2 3.8 5.7 3.8 4.9 ...
## $ Horsepower : int 140 200 172 172 208 110 170 180 170 200 ...
## $ RPM : int 6300 5500 5500 5500 5700 5200 4800 4000 4800 4100 ...
## $ Rev.per.mile : int 2890 2335 2280 2535 2545 2565 1570 1320 1690 1510 ...
## $ Man.trans.avail : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 1 1 1 1 1 ...
## $ Fuel.tank.capacity: num 13.2 18 16.9 21.1 21.1 16.4 18 23 18.8 18 ...
## $ Passengers : int 5 5 5 6 4 6 6 6 5 6 ...
## $ Length : int 177 195 180 193 186 189 200 216 198 206 ...
## $ Wheelbase : int 102 115 102 106 109 105 111 116 108 114 ...
## $ Width : int 68 71 67 70 69 69 74 78 73 73 ...
## $ Turn.circle : int 37 38 37 37 39 41 42 45 41 43 ...
## $ Rear.seat.room : num 26.5 30 28 31 27 28 30.5 30.5 26.5 35 ...
## $ Luggage.room : int 11 15 14 17 13 16 17 21 14 18 ...
## $ Weight : int 2705 3560 3375 3405 3640 2880 3470 4105 3495 3620 ...
## $ Origin : Factor w/ 2 levels "USA","non-USA": 2 2 2 2 2 1 1 1 1 1 ...
## $ Make : Factor w/ 93 levels "Acura Integra",..: 1 2 4 3 5 6 7 9 8 10 ...
glimpse(Cars93)
## Observations: 93
## Variables: 27
## $ Manufacturer <fct> Acura, Acura, Audi, Audi, BMW, Buick, Buick...
## $ Model <fct> Integra, Legend, 90, 100, 535i, Century, Le...
## $ Type <fct> Small, Midsize, Compact, Midsize, Midsize, ...
## $ Min.Price <dbl> 12.9, 29.2, 25.9, 30.8, 23.7, 14.2, 19.9, 2...
## $ Price <dbl> 15.9, 33.9, 29.1, 37.7, 30.0, 15.7, 20.8, 2...
## $ Max.Price <dbl> 18.8, 38.7, 32.3, 44.6, 36.2, 17.3, 21.7, 2...
## $ MPG.city <int> 25, 18, 20, 19, 22, 22, 19, 16, 19, 16, 16,...
## $ MPG.highway <int> 31, 25, 26, 26, 30, 31, 28, 25, 27, 25, 25,...
## $ AirBags <fct> None, Driver & Passenger, Driver only, Driv...
## $ DriveTrain <fct> Front, Front, Front, Front, Rear, Front, Fr...
## $ Cylinders <fct> 4, 6, 6, 6, 4, 4, 6, 6, 6, 8, 8, 4, 4, 6, 4...
## $ EngineSize <dbl> 1.8, 3.2, 2.8, 2.8, 3.5, 2.2, 3.8, 5.7, 3.8...
## $ Horsepower <int> 140, 200, 172, 172, 208, 110, 170, 180, 170...
## $ RPM <int> 6300, 5500, 5500, 5500, 5700, 5200, 4800, 4...
## $ Rev.per.mile <int> 2890, 2335, 2280, 2535, 2545, 2565, 1570, 1...
## $ Man.trans.avail <fct> Yes, Yes, Yes, Yes, Yes, No, No, No, No, No...
## $ Fuel.tank.capacity <dbl> 13.2, 18.0, 16.9, 21.1, 21.1, 16.4, 18.0, 2...
## $ Passengers <int> 5, 5, 5, 6, 4, 6, 6, 6, 5, 6, 5, 5, 5, 4, 6...
## $ Length <int> 177, 195, 180, 193, 186, 189, 200, 216, 198...
## $ Wheelbase <int> 102, 115, 102, 106, 109, 105, 111, 116, 108...
## $ Width <int> 68, 71, 67, 70, 69, 69, 74, 78, 73, 73, 74,...
## $ Turn.circle <int> 37, 38, 37, 37, 39, 41, 42, 45, 41, 43, 44,...
## $ Rear.seat.room <dbl> 26.5, 30.0, 28.0, 31.0, 27.0, 28.0, 30.5, 3...
## $ Luggage.room <int> 11, 15, 14, 17, 13, 16, 17, 21, 14, 18, 14,...
## $ Weight <int> 2705, 3560, 3375, 3405, 3640, 2880, 3470, 4...
## $ Origin <fct> non-USA, non-USA, non-USA, non-USA, non-USA...
## $ Make <fct> Acura Integra, Acura Legend, Audi 90, Audi ...
summary(Cars93)
## Manufacturer Model Type Min.Price Price
## Chevrolet: 8 100 : 1 Compact:16 Min. : 6.70 Min. : 7.40
## Ford : 8 190E : 1 Large :11 1st Qu.:10.80 1st Qu.:12.20
## Dodge : 6 240 : 1 Midsize:22 Median :14.70 Median :17.70
## Mazda : 5 300E : 1 Small :21 Mean :17.13 Mean :19.51
## Pontiac : 5 323 : 1 Sporty :14 3rd Qu.:20.30 3rd Qu.:23.30
## Buick : 4 535i : 1 Van : 9 Max. :45.40 Max. :61.90
## (Other) :57 (Other):87
## Max.Price MPG.city MPG.highway AirBags
## Min. : 7.9 Min. :15.00 Min. :20.00 Driver & Passenger:16
## 1st Qu.:14.7 1st Qu.:18.00 1st Qu.:26.00 Driver only :43
## Median :19.6 Median :21.00 Median :28.00 None :34
## Mean :21.9 Mean :22.37 Mean :29.09
## 3rd Qu.:25.3 3rd Qu.:25.00 3rd Qu.:31.00
## Max. :80.0 Max. :46.00 Max. :50.00
##
## DriveTrain Cylinders EngineSize Horsepower RPM
## 4WD :10 3 : 3 Min. :1.000 Min. : 55.0 Min. :3800
## Front:67 4 :49 1st Qu.:1.800 1st Qu.:103.0 1st Qu.:4800
## Rear :16 5 : 2 Median :2.400 Median :140.0 Median :5200
## 6 :31 Mean :2.668 Mean :143.8 Mean :5281
## 8 : 7 3rd Qu.:3.300 3rd Qu.:170.0 3rd Qu.:5750
## rotary: 1 Max. :5.700 Max. :300.0 Max. :6500
##
## Rev.per.mile Man.trans.avail Fuel.tank.capacity Passengers
## Min. :1320 No :32 Min. : 9.20 Min. :2.000
## 1st Qu.:1985 Yes:61 1st Qu.:14.50 1st Qu.:4.000
## Median :2340 Median :16.40 Median :5.000
## Mean :2332 Mean :16.66 Mean :5.086
## 3rd Qu.:2565 3rd Qu.:18.80 3rd Qu.:6.000
## Max. :3755 Max. :27.00 Max. :8.000
##
## Length Wheelbase Width Turn.circle
## Min. :141.0 Min. : 90.0 Min. :60.00 Min. :32.00
## 1st Qu.:174.0 1st Qu.: 98.0 1st Qu.:67.00 1st Qu.:37.00
## Median :183.0 Median :103.0 Median :69.00 Median :39.00
## Mean :183.2 Mean :103.9 Mean :69.38 Mean :38.96
## 3rd Qu.:192.0 3rd Qu.:110.0 3rd Qu.:72.00 3rd Qu.:41.00
## Max. :219.0 Max. :119.0 Max. :78.00 Max. :45.00
##
## Rear.seat.room Luggage.room Weight Origin
## Min. :19.00 Min. : 6.00 Min. :1695 USA :48
## 1st Qu.:26.00 1st Qu.:12.00 1st Qu.:2620 non-USA:45
## Median :27.50 Median :14.00 Median :3040
## Mean :27.83 Mean :13.89 Mean :3073
## 3rd Qu.:30.00 3rd Qu.:15.00 3rd Qu.:3525
## Max. :36.00 Max. :22.00 Max. :4105
## NA's :2 NA's :11
## Make
## Acura Integra: 1
## Acura Legend : 1
## Audi 100 : 1
## Audi 90 : 1
## BMW 535i : 1
## Buick Century: 1
## (Other) :87
# City MPG against car weight, with a green smoother and the y-axis
# starting at zero.
ggplot(data = Cars93, mapping = aes(x = Weight, y = MPG.city)) +
  geom_point() +
  geom_smooth(colour = "green") +
  ylim(0, 50)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
data(father.son, package="UsingR")
class(father.son)
## [1] "data.frame"
str(father.son)
## 'data.frame': 1078 obs. of 2 variables:
## $ fheight: num 65 63.3 65 65.8 61.1 ...
## $ sheight: num 59.8 63.2 63.3 62.8 64.3 ...
glimpse(father.son)
## Observations: 1,078
## Variables: 2
## $ fheight <dbl> 65.04851, 63.25094, 64.95532, 65.75250, 61.13723, 63.0...
## $ sheight <dbl> 59.77827, 63.21404, 63.34242, 62.79238, 64.28113, 64.2...
summary(father.son)
## fheight sheight
## Min. :59.01 Min. :58.51
## 1st Qu.:65.79 1st Qu.:66.93
## Median :67.77 Median :68.62
## Mean :67.69 Mean :68.68
## 3rd Qu.:69.60 3rd Qu.:70.47
## Max. :75.43 Max. :78.36
# Son's height against father's height, with the fitted regression line in
# red and the identity line y = x as a reference.
ggplot(data = father.son, mapping = aes(x = fheight, y = sheight)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red") +
  geom_abline(intercept = 0, slope = 1)
data(father.son, package="UsingR")
# Linear regression of son's height on father's height, followed by the
# coefficient table and fit statistics.
m1 <- lm(formula = sheight ~ fheight, data = father.son)
summary(m1)
##
## Call:
## lm(formula = sheight ~ fheight, data = father.son)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8772 -1.5144 -0.0079 1.6285 8.9685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.88660 1.83235 18.49 <2e-16 ***
## fheight 0.51409 0.02705 19.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.437 on 1076 degrees of freedom
## Multiple R-squared: 0.2513, Adjusted R-squared: 0.2506
## F-statistic: 361.2 on 1 and 1076 DF, p-value: < 2.2e-16
# Diagnostic plots for the fitted model m1, arranged in a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(m1)

# Reload the data set, then compare the straight-line fit (red, drawn
# without a confidence band) with geom_smooth()'s default smoother.
data(father.son, package = "UsingR")
ggplot(data = father.son, mapping = aes(x = fheight, y = sheight)) +
  geom_point() +
  geom_smooth(method = "lm", colour = "red", se = FALSE) +
  geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Weight against Height with one panel per sex, stacked in a single column.
ggplot(data = oly12, mapping = aes(x = Height, y = Weight)) +
  geom_point(size = 1) +
  facet_wrap(~Sex, ncol = 1)
## Warning: Removed 1346 rows containing missing values (geom_point).
# Copy of oly12 whose Sport values are abbreviated (minimum length 12),
# then the same scatterplot with one panel per sport.
oly12S <- within(oly12, Sport <- abbreviate(Sport, minlength = 12))
ggplot(data = oly12S, mapping = aes(x = Height, y = Weight)) +
  geom_point(size = 1) +
  facet_wrap(~Sport) +
  ggtitle("Weight and Height by Sport")
## Warning: Removed 1346 rows containing missing values (geom_point).
# Restrict the athletes to three sports and plot Weight against Height with
# one panel per sport.
# filter() is namespace-qualified because stats::filter() has the same name
# and is the one R finds when dplyr is not attached or has been masked.
oly12JWW <- dplyr::filter(
  oly12,
  Sport %in% c("Judo", "Weightlifting", "Wrestling")
)
ggplot(oly12JWW, aes(x = Height, y = Weight)) +
  geom_point(size = 1) +
  facet_wrap(~Sport) +
  ggtitle("Weight and Height by Sport")
## Warning: Removed 69 rows containing missing values (geom_point).
library(GGally)
library(VGAMdata)
data(crime.us, package="VGAMdata")
class(crime.us)
## [1] "data.frame"
str(crime.us)
## 'data.frame': 50 obs. of 22 variables:
## $ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Population : int 4708708 698473 6595778 2889450 36961664 5024748 3518288 885122 18537969 9829211 ...
## $ ViolentCrimeTotal : int 21179 4421 26929 14959 174459 16976 10508 5635 113541 41880 ...
## $ Murder : int 323 22 354 179 1972 175 107 41 1017 566 ...
## $ Rape : int 1504 512 2110 1368 8713 2242 651 338 5501 2301 ...
## $ Robbery : int 6259 655 8099 2582 64093 3387 3990 1671 30911 14603 ...
## $ Assault : int 13093 3232 16366 10830 99681 11172 5760 3585 76112 24410 ...
## $ PropertyCrimeTotal : int 177629 20577 234582 109038 1009614 133968 82181 29648 712010 360400 ...
## $ Burglary : int 48837 3597 53412 34764 230137 26649 15073 6932 181884 98362 ...
## $ LarcenyTheft : int 117711 15291 155184 68171 615456 94861 59632 20809 479867 228893 ...
## $ MotorVehicleTheft : int 11081 1689 25986 6103 164021 12458 7476 1907 50259 33145 ...
## $ ViolentCrimeRate : num 450 633 408 518 472 ...
## $ MurderRate : num 6.9 3.1 5.4 6.2 5.3 3.5 3 4.6 5.5 5.8 ...
## $ RapeRate : num 31.9 73.3 32 47.3 23.6 44.6 18.5 38.2 29.7 23.4 ...
## $ RobberyRate : num 132.9 93.8 122.8 89.4 173.4 ...
## $ AssaultRate : num 278 463 248 375 270 ...
## $ PropertyCrimeRate : num 3772 2946 3556 3774 2732 ...
## $ BurglaryRate : num 1037 515 810 1203 623 ...
## $ LarcenyTheftRate : num 2500 2189 2353 2359 1665 ...
## $ MotorVehicleTheftRate: num 235 242 394 211 444 ...
## $ stateNumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ abbrev : chr "AL" "AK" "AZ" "AR" ...
glimpse(crime.us)
## Observations: 50
## Variables: 22
## $ State <chr> "Alabama", "Alaska", "Arizona", "Arkansa...
## $ Population <int> 4708708, 698473, 6595778, 2889450, 36961...
## $ ViolentCrimeTotal <int> 21179, 4421, 26929, 14959, 174459, 16976...
## $ Murder <int> 323, 22, 354, 179, 1972, 175, 107, 41, 1...
## $ Rape <int> 1504, 512, 2110, 1368, 8713, 2242, 651, ...
## $ Robbery <int> 6259, 655, 8099, 2582, 64093, 3387, 3990...
## $ Assault <int> 13093, 3232, 16366, 10830, 99681, 11172,...
## $ PropertyCrimeTotal <int> 177629, 20577, 234582, 109038, 1009614, ...
## $ Burglary <int> 48837, 3597, 53412, 34764, 230137, 26649...
## $ LarcenyTheft <int> 117711, 15291, 155184, 68171, 615456, 94...
## $ MotorVehicleTheft <int> 11081, 1689, 25986, 6103, 164021, 12458,...
## $ ViolentCrimeRate <dbl> 449.8, 633.0, 408.3, 517.7, 472.0, 337.8...
## $ MurderRate <dbl> 6.9, 3.1, 5.4, 6.2, 5.3, 3.5, 3.0, 4.6, ...
## $ RapeRate <dbl> 31.9, 73.3, 32.0, 47.3, 23.6, 44.6, 18.5...
## $ RobberyRate <dbl> 132.9, 93.8, 122.8, 89.4, 173.4, 67.4, 1...
## $ AssaultRate <dbl> 278.1, 462.7, 248.1, 374.8, 269.7, 222.3...
## $ PropertyCrimeRate <dbl> 3772.4, 2946.0, 3556.5, 3773.7, 2731.5, ...
## $ BurglaryRate <dbl> 1037.2, 515.0, 809.8, 1203.1, 622.6, 530...
## $ LarcenyTheftRate <dbl> 2499.9, 2189.2, 2352.8, 2359.3, 1665.1, ...
## $ MotorVehicleTheftRate <dbl> 235.3, 241.8, 394.0, 211.2, 443.8, 247.9...
## $ stateNumber <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1...
## $ abbrev <chr> "AL", "AK", "AZ", "AR", "CA", "CO", "CT"...
summary(crime.us)
## State Population ViolentCrimeTotal Murder
## Length:50 Min. : 544270 Min. : 817 Min. : 7.00
## Class :character 1st Qu.: 1802408 1st Qu.: 5456 1st Qu.: 37.75
## Mode :character Median : 4403094 Median : 15968 Median : 176.50
## Mean : 6128138 Mean : 26207 Mean : 301.94
## 3rd Qu.: 6647091 3rd Qu.: 30481 3rd Qu.: 424.25
## Max. :36961664 Max. :174459 Max. :1972.00
## Rape Robbery Assault PropertyCrimeTotal
## Min. : 124.0 Min. : 77 Min. : 575 Min. : 12502
## 1st Qu.: 562.8 1st Qu.: 1201 1st Qu.: 3610 1st Qu.: 47968
## Median :1263.5 Median : 3810 Median :10297 Median : 132868
## Mean :1758.9 Mean : 8077 Mean :16069 Mean : 185850
## 3rd Qu.:2080.8 3rd Qu.: 9260 3rd Qu.:20017 3rd Qu.: 226611
## Max. :8713.0 Max. :64093 Max. :99681 Max. :1009614
## Burglary LarcenyTheft MotorVehicleTheft ViolentCrimeRate
## Min. : 2230 Min. : 9296 Min. : 448 Min. :119.8
## 1st Qu.: 9871 1st Qu.: 34424 1st Qu.: 3583 1st Qu.:255.3
## Median : 29432 Median : 89563 Median : 10136 Median :335.5
## Mean : 43909 Mean :126160 Mean : 15782 Mean :382.0
## 3rd Qu.: 51821 3rd Qu.:153502 3rd Qu.: 17736 3rd Qu.:495.7
## Max. :240233 Max. :678353 Max. :164021 Max. :702.2
## MurderRate RapeRate RobberyRate AssaultRate
## Min. : 0.800 Min. :12.00 Min. : 13.70 Min. : 59.0
## 1st Qu.: 2.525 1st Qu.:26.05 1st Qu.: 65.08 1st Qu.:151.2
## Median : 4.300 Median :31.10 Median : 97.10 Median :213.8
## Mean : 4.276 Mean :32.08 Mean :100.15 Mean :245.5
## 3rd Qu.: 5.875 3rd Qu.:35.23 3rd Qu.:135.35 3rd Qu.:320.3
## Max. :11.800 Max. :73.30 Max. :227.80 Max. :503.4
## PropertyCrimeRate BurglaryRate LarcenyTheftRate MotorVehicleTheftRate
## Min. :1719 Min. : 305.2 Min. :1314 Min. : 72.1
## 1st Qu.:2438 1st Qu.: 484.1 1st Qu.:1728 1st Qu.:149.6
## Median :2892 Median : 636.1 Median :2039 Median :213.0
## Mean :2941 Mean : 678.6 Mean :2039 Mean :222.9
## 3rd Qu.:3569 3rd Qu.: 885.2 3rd Qu.:2345 3rd Qu.:279.4
## Max. :4016 Max. :1203.1 Max. :2737 Max. :468.4
## stateNumber abbrev
## Min. : 1.00 Length:50
## 1st Qu.:13.25 Class :character
## Median :25.50 Mode :character
## Mean :25.50
## 3rd Qu.:37.75
## Max. :50.00
# Work on a copy whose column names have the trailing "Rate" removed, so
# the scatterplot-matrix labels stay short. The pattern is anchored with
# "Rate$" instead of starting with a bare "*" quantifier, which is not a
# well-defined regular expression.
crime.usR <- crime.us
names(crime.usR) <- sub("Rate$", "", names(crime.usR))
# Columns 19 and 20 get shorter names.
names(crime.usR)[19:20] <- c("Larceny", "MotorVTheft")

# Scatterplot matrix of seven of the rate columns (13-16 and 18-20).
# `continuous` must be spelled out in full in `upper`; the former
# misspelling meant the loess-smoothed upper panels were never requested.
ggpairs(crime.usR[, c(13:16, 18:20)],
        title = "Crime rates in the USA",
        upper = list(continuous = "smooth_loess"),
        diag = list(continuous = "densityDiag"),
        axisLabels = "none")
# niveles de correlación por color
class(USJudgeRatings)
## [1] "data.frame"
str(USJudgeRatings)
## 'data.frame': 43 obs. of 12 variables:
## $ CONT: num 5.7 6.8 7.2 6.8 7.3 6.2 10.6 7 7.3 8.2 ...
## $ INTG: num 7.9 8.9 8.1 8.8 6.4 8.8 9 5.9 8.9 7.9 ...
## $ DMNR: num 7.7 8.8 7.8 8.5 4.3 8.7 8.9 4.9 8.9 6.7 ...
## $ DILG: num 7.3 8.5 7.8 8.8 6.5 8.5 8.7 5.1 8.7 8.1 ...
## $ CFMG: num 7.1 7.8 7.5 8.3 6 7.9 8.5 5.4 8.6 7.9 ...
## $ DECI: num 7.4 8.1 7.6 8.5 6.2 8 8.5 5.9 8.5 8 ...
## $ PREP: num 7.1 8 7.5 8.7 5.7 8.1 8.5 4.8 8.4 7.9 ...
## $ FAMI: num 7.1 8 7.5 8.7 5.7 8 8.5 5.1 8.4 8.1 ...
## $ ORAL: num 7.1 7.8 7.3 8.4 5.1 8 8.6 4.7 8.4 7.7 ...
## $ WRIT: num 7 7.9 7.4 8.5 5.3 8 8.4 4.9 8.5 7.8 ...
## $ PHYS: num 8.3 8.5 7.9 8.8 5.5 8.6 9.1 6.8 8.8 8.5 ...
## $ RTEN: num 7.8 8.7 7.8 8.7 4.8 8.6 9 5 8.8 7.9 ...
glimpse(USJudgeRatings)
## Observations: 43
## Variables: 12
## $ CONT <dbl> 5.7, 6.8, 7.2, 6.8, 7.3, 6.2, 10.6, 7.0, 7.3, 8.2, 7.0, 6...
## $ INTG <dbl> 7.9, 8.9, 8.1, 8.8, 6.4, 8.8, 9.0, 5.9, 8.9, 7.9, 8.0, 8....
## $ DMNR <dbl> 7.7, 8.8, 7.8, 8.5, 4.3, 8.7, 8.9, 4.9, 8.9, 6.7, 7.6, 7....
## $ DILG <dbl> 7.3, 8.5, 7.8, 8.8, 6.5, 8.5, 8.7, 5.1, 8.7, 8.1, 7.4, 7....
## $ CFMG <dbl> 7.1, 7.8, 7.5, 8.3, 6.0, 7.9, 8.5, 5.4, 8.6, 7.9, 7.3, 7....
## $ DECI <dbl> 7.4, 8.1, 7.6, 8.5, 6.2, 8.0, 8.5, 5.9, 8.5, 8.0, 7.5, 7....
## $ PREP <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.1, 8.5, 4.8, 8.4, 7.9, 7.1, 6....
## $ FAMI <dbl> 7.1, 8.0, 7.5, 8.7, 5.7, 8.0, 8.5, 5.1, 8.4, 8.1, 7.2, 7....
## $ ORAL <dbl> 7.1, 7.8, 7.3, 8.4, 5.1, 8.0, 8.6, 4.7, 8.4, 7.7, 7.1, 7....
## $ WRIT <dbl> 7.0, 7.9, 7.4, 8.5, 5.3, 8.0, 8.4, 4.9, 8.5, 7.8, 7.2, 7....
## $ PHYS <dbl> 8.3, 8.5, 7.9, 8.8, 5.5, 8.6, 9.1, 6.8, 8.8, 8.5, 8.4, 6....
## $ RTEN <dbl> 7.8, 8.7, 7.8, 8.7, 4.8, 8.6, 9.0, 5.0, 8.8, 7.9, 7.7, 7....
summary(USJudgeRatings)
## CONT INTG DMNR DILG
## Min. : 5.700 Min. :5.900 Min. :4.300 Min. :5.100
## 1st Qu.: 6.850 1st Qu.:7.550 1st Qu.:6.900 1st Qu.:7.150
## Median : 7.300 Median :8.100 Median :7.700 Median :7.800
## Mean : 7.437 Mean :8.021 Mean :7.516 Mean :7.693
## 3rd Qu.: 7.900 3rd Qu.:8.550 3rd Qu.:8.350 3rd Qu.:8.450
## Max. :10.600 Max. :9.200 Max. :9.000 Max. :9.000
## CFMG DECI PREP FAMI
## Min. :5.400 Min. :5.700 Min. :4.800 Min. :5.100
## 1st Qu.:7.000 1st Qu.:7.100 1st Qu.:6.900 1st Qu.:6.950
## Median :7.600 Median :7.700 Median :7.700 Median :7.600
## Mean :7.479 Mean :7.565 Mean :7.467 Mean :7.488
## 3rd Qu.:8.050 3rd Qu.:8.150 3rd Qu.:8.200 3rd Qu.:8.250
## Max. :8.700 Max. :8.800 Max. :9.100 Max. :9.100
## ORAL WRIT PHYS RTEN
## Min. :4.700 Min. :4.900 Min. :4.700 Min. :4.800
## 1st Qu.:6.850 1st Qu.:6.900 1st Qu.:7.700 1st Qu.:7.150
## Median :7.500 Median :7.600 Median :8.100 Median :7.800
## Mean :7.293 Mean :7.384 Mean :7.935 Mean :7.602
## 3rd Qu.:8.000 3rd Qu.:8.050 3rd Qu.:8.500 3rd Qu.:8.250
## Max. :8.900 Max. :9.000 Max. :9.100 Max. :9.200
library(gclus)
## Loading required package: cluster
# Correlation matrix of the ratings, turned into a matrix of panel colours
# by dmat.color().
judge.cor <- cor(USJudgeRatings)
judge.color <- dmat.color(judge.cor)

# Scatterplot matrix whose panels are coloured with judge.color.
cpairs(
  USJudgeRatings,
  panel.colors = judge.color,
  pch = ".",
  gap = 0.5
)

# Generalized pairs plot of the first ten columns of Leaves, with loess
# smoothers in the lower-triangle scatterplots.
library(gpairs)
data(Leaves)
gpairs(
  Leaves[1:10],
  lower.pars = list(scatter = "loess")
)
# Parallel coordinate plot
library(GGally)
library(ggplot2)
# Parallel coordinate plot of the four measurement columns of iris, with
# lines grouped by Species.
ggparcoord(
  iris,
  columns = 1:4,
  groupColumn = "Species"
)
# Copy of iris whose four measurement columns carry abbreviated names; the
# fifth column keeps the name "Species".
iris1 <- setNames(iris, c(abbreviate(names(iris)[1:4]), "Species"))
summary(iris1)
## Sp.L Sp.W Pt.L Pt.W
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
# Four parallel coordinate plots of the same columns that differ only in
# the `scale` setting: a1 uses ggparcoord()'s default, a2 "uniminmax",
# a3 "globalminmax", and a4 "center" with the median as scaleSummary.
a1 <- ggparcoord(
  iris1,
  columns = 1:4,
  alphaLines = 0.7,
  groupColumn = "Species"
) +
  labs(x = "", y = "") +
  ggtitle("a1")

a2 <- ggparcoord(
  iris1,
  columns = 1:4,
  scale = "uniminmax",
  alphaLines = 0.7,
  groupColumn = "Species"
) +
  labs(x = "", y = "") +
  ggtitle("a2")

a3 <- ggparcoord(
  iris1,
  columns = 1:4,
  scale = "globalminmax",
  alphaLines = 0.7,
  groupColumn = "Species"
) +
  labs(x = "", y = "") +
  ggtitle("a3")

a4 <- ggparcoord(
  iris1,
  columns = 1:4,
  scale = "center",
  scaleSummary = "median",
  alphaLines = 0.7,
  groupColumn = "Species"
) +
  labs(x = "", y = "") +
  ggtitle("a4")

# Show the four variants together.
gridExtra::grid.arrange(a1, a2, a3, a4)
# Mosaico
# Long-format copy of the Titanic table: one row per combination of the
# factors, with its count in Freq.
titanic <- as.data.frame(Titanic)

# Four mosaic plots in a 2 x 2 grid, adding one more variable each time.
par(mfrow = c(2, 2), mar = c(4, 4, 0.1, 0.1))
mosaicplot(xtabs(Freq ~ Survived, data = titanic), main = "")
mosaicplot(xtabs(Freq ~ Survived + Sex, data = titanic), main = "")
mosaicplot(xtabs(Freq ~ Survived + Sex + Class, data = titanic),
           main = "")
mosaicplot(xtabs(Freq ~ Survived + Sex + Class + Age, data = titanic),
           main = "")

# Pairs plot of the table rebuilt from all columns of the data frame.
pairs(xtabs(Freq ~ ., data = titanic))

# Bars of Freq by Survived, filled by Sex and faceted by Class (rows) and
# Sex + Age (columns); the legend is hidden.
ggplot(titanic, aes(x = Survived, y = Freq, fill = Sex)) +
  geom_bar(stat = "identity") +
  facet_grid(Class ~ Sex + Age) +
  theme(legend.position = "none")

# Note: fluctile() is given the original Titanic table, not the titanic
# data frame.
extracat::fluctile(Titanic)
extracat::rmb(
  formula = ~Sex+Class+Age+Survived,
  data = titanic,
  cat.ord = 2,
  spine = TRUE,
  freq.trans = "const"
)
# UCB Admissions
library(vcd)
## Loading required package: grid
# Long-format data frame of the UCBAdmissions table: one row per
# combination of its factors, with the count in Freq.
ucb <- as.data.frame(UCBAdmissions)
class(ucb)
## [1] "data.frame"
str(ucb)
## 'data.frame': 24 obs. of 4 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 2 1 2 1 2 1 2 1 2 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 2 2 1 1 2 2 1 1 ...
## $ Dept : Factor w/ 6 levels "A","B","C","D",..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Freq : num 512 313 89 19 353 207 17 8 120 205 ...
glimpse(ucb)
## Observations: 24
## Variables: 4
## $ Admit <fct> Admitted, Rejected, Admitted, Rejected, Admitted, Rejec...
## $ Gender <fct> Male, Male, Female, Female, Male, Male, Female, Female,...
## $ Dept <fct> A, A, A, A, B, B, B, B, C, C, C, C, D, D, D, D, E, E, E...
## $ Freq <dbl> 512, 313, 89, 19, 353, 207, 17, 8, 120, 205, 202, 391, ...
summary(ucb)
## Admit Gender Dept Freq
## Admitted:12 Male :12 A:4 Min. : 8.0
## Rejected:12 Female:12 B:4 1st Qu.: 80.0
## C:4 Median :170.0
## D:4 Mean :188.6
## E:4 3rd Qu.:302.5
## F:4 Max. :512.0
# Accept is Admit with its levels reordered to Rejected, Admitted.
ucb$Accept <- factor(ucb$Admit, levels = c("Rejected", "Admitted"))

# Doubledecker plot of the Dept x Gender x Accept table, with the two
# Accept levels filled grey90 and steelblue.
doubledecker(
  xtabs(Freq ~ Dept + Gender + Accept, data = ucb),
  gp = gpar(fill = c("grey90", "steelblue"))
)